import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silence all warnings to keep the notebook output readable.
# NOTE(review): this also hides deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
First, download the dataset from https://www.kaggle.com/code/sadkoktaybicici/credit-card-data-clustering-k-mean/data and then import it in Python.
# Load the credit-card dataset into a DataFrame and report its dimensions.
csv_location = '/content/CC GENERAL.csv'  # the path where you downloaded the data
df = pd.read_csv(csv_location)
print('The shape of the dataset is:', df.shape)
The shape of the dataset is: (8950, 18)
In this part you need to check the data quality and assess any issues in the data.
Comment on each issue you find.
# Column dtypes and non-null counts — reveals missing values in
# CREDIT_LIMIT (8949/8950 non-null) and MINIMUM_PAYMENTS (8637/8950 non-null).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8950 entries, 0 to 8949 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CUST_ID 8950 non-null object 1 BALANCE 8950 non-null float64 2 BALANCE_FREQUENCY 8950 non-null float64 3 PURCHASES 8950 non-null float64 4 ONEOFF_PURCHASES 8950 non-null float64 5 INSTALLMENTS_PURCHASES 8950 non-null float64 6 CASH_ADVANCE 8950 non-null float64 7 PURCHASES_FREQUENCY 8950 non-null float64 8 ONEOFF_PURCHASES_FREQUENCY 8950 non-null float64 9 PURCHASES_INSTALLMENTS_FREQUENCY 8950 non-null float64 10 CASH_ADVANCE_FREQUENCY 8950 non-null float64 11 CASH_ADVANCE_TRX 8950 non-null int64 12 PURCHASES_TRX 8950 non-null int64 13 CREDIT_LIMIT 8949 non-null float64 14 PAYMENTS 8950 non-null float64 15 MINIMUM_PAYMENTS 8637 non-null float64 16 PRC_FULL_PAYMENT 8950 non-null float64 17 TENURE 8950 non-null int64 dtypes: float64(14), int64(3), object(1) memory usage: 1.2+ MB
# Summary statistics; the large gaps between the 75th percentile and the max
# in the monetary columns hint at heavy right skew and outliers.
df.describe()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8949.000000 | 8950.000000 | 8637.000000 | 8950.000000 | 8950.000000 |
| mean | 1564.474828 | 0.877271 | 1003.204834 | 592.437371 | 411.067645 | 978.871112 | 0.490351 | 0.202458 | 0.364437 | 0.135144 | 3.248827 | 14.709832 | 4494.449450 | 1733.143852 | 864.206542 | 0.153715 | 11.517318 |
| std | 2081.531879 | 0.236904 | 2136.634782 | 1659.887917 | 904.338115 | 2097.163877 | 0.401371 | 0.298336 | 0.397448 | 0.200121 | 6.824647 | 24.857649 | 3638.815725 | 2895.063757 | 2372.446607 | 0.292499 | 1.338331 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 128.281915 | 0.888889 | 39.635000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.276166 | 169.123707 | 0.000000 | 12.000000 |
| 50% | 873.385231 | 1.000000 | 361.280000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 856.901546 | 312.343947 | 0.000000 | 12.000000 |
| 75% | 2054.140036 | 1.000000 | 1110.130000 | 577.405000 | 468.637500 | 1113.821139 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.134317 | 825.485459 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Five-bin histogram of BALANCE with each bar annotated by its percentage share.
target_col = 'BALANCE'
values = df[[target_col]].dropna()[target_col]

plt.figure(figsize=(6, 4))
sns.histplot(values, bins=5, kde=False, color='lightgreen', edgecolor='black')
plt.xlabel(f'{target_col}')
plt.ylabel('Count')
plt.title(f'Histogram of {target_col}')

# Recompute the same five bins so each bar can be labelled with its share of rows.
bin_counts, bin_edges = np.histogram(values, bins=5)
half_width = np.diff(bin_edges)[0] / 2
for bar_height, left_edge in zip(bin_counts, bin_edges[:-1]):
    share = bar_height / len(values) * 100
    plt.annotate(f'{share:.3f}%', xy=(left_edge + half_width, bar_height),
                 xytext=(0, 1), textcoords='offset points', ha='center',
                 va='bottom', fontsize=8, color='blue')
plt.tight_layout()
plt.show()
Most customers have a balance greater than 2500 and less than 5000.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Five-bin histogram of PURCHASES with each bar annotated by its percentage share.
target_col = 'PURCHASES'
values = df[[target_col]].dropna()[target_col]

plt.figure(figsize=(6, 4))
sns.histplot(values, bins=5, kde=False, color='lightgreen', edgecolor='black')
plt.xlabel(f'{target_col}')
plt.ylabel('Count')
plt.title(f'Histogram of {target_col}')

# Recompute the same five bins so each bar can be labelled with its share of rows.
bin_counts, bin_edges = np.histogram(values, bins=5)
half_width = np.diff(bin_edges)[0] / 2
for bar_height, left_edge in zip(bin_counts, bin_edges[:-1]):
    share = bar_height / len(values) * 100
    plt.annotate(f'{share:.3f}%', xy=(left_edge + half_width, bar_height),
                 xytext=(0, 1), textcoords='offset points', ha='center',
                 va='bottom', fontsize=8, color='blue')
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Five-bin histogram of CASH_ADVANCE with each bar annotated by its percentage share.
target_col = 'CASH_ADVANCE'
values = df[[target_col]].dropna()[target_col]

plt.figure(figsize=(6, 4))
sns.histplot(values, bins=5, kde=False, color='lightgreen', edgecolor='black')
plt.xlabel(f'{target_col}')
plt.ylabel('Count')
plt.title(f'Histogram of {target_col}')

# Recompute the same five bins so each bar can be labelled with its share of rows.
bin_counts, bin_edges = np.histogram(values, bins=5)
half_width = np.diff(bin_edges)[0] / 2
for bar_height, left_edge in zip(bin_counts, bin_edges[:-1]):
    share = bar_height / len(values) * 100
    plt.annotate(f'{share:.3f}%', xy=(left_edge + half_width, bar_height),
                 xytext=(0, 1), textcoords='offset points', ha='center',
                 va='bottom', fontsize=8, color='blue')
plt.tight_layout()
plt.show()
Most customers (98.227%) fall in the lowest CASH_ADVANCE bin.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Five-bin histogram of BALANCE_FREQUENCY with each bar annotated by its share.
target_col = 'BALANCE_FREQUENCY'
values = df[[target_col]].dropna()[target_col]

plt.figure(figsize=(6, 4))
sns.histplot(values, bins=5, kde=False, color='lightgreen', edgecolor='black')
plt.xlabel(f'{target_col}')
plt.ylabel('Count')
plt.title(f'Histogram of {target_col}')

# Recompute the same five bins so each bar can be labelled with its share of rows.
bin_counts, bin_edges = np.histogram(values, bins=5)
half_width = np.diff(bin_edges)[0] / 2
for bar_height, left_edge in zip(bin_counts, bin_edges[:-1]):
    share = bar_height / len(values) * 100
    plt.annotate(f'{share:.3f}%', xy=(left_edge + half_width, bar_height),
                 xytext=(0, 1), textcoords='offset points', ha='center',
                 va='bottom', fontsize=8, color='blue')
plt.tight_layout()
plt.show()
# Check for duplicated rows (the output below shows 0 — no duplicates).
df.duplicated().sum()
0
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

# Box plot of every feature except CUST_ID to eyeball outliers.
# Fix: the original called plt.figure() and then plt.subplots(), leaking one
# empty figure per column (the "<Figure ... with 0 Axes>" outputs). The figure
# size is now passed directly to plt.subplots().
for col in list(df.columns)[1:]:
    fig, ax_box = plt.subplots(figsize=(20, 3))
    ax_box.set(xlabel=col, ylabel='Density')
    sns.boxplot(x=col, linewidth=1.0, palette='Blues', data=df, ax=ax_box)
    plt.show()
<Figure size 640x480 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

# KDE plot of every feature except CUST_ID to inspect the distributions.
# Fix: the original called plt.figure() and then plt.subplots(), leaking one
# empty figure per column; the figure size is now passed directly to subplots().
for col in list(df.columns)[1:]:
    fig, ax_kde = plt.subplots(figsize=(20, 3))
    ax_kde.set(xlabel=col, ylabel='Density')
    sns.kdeplot(df[col], fill=True, palette='Blues', ax=ax_kde)
    plt.show()
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
<Figure size 2000x300 with 0 Axes>
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of the numeric features.
# Fix: df.corr() raises a TypeError on the object-typed CUST_ID column in
# pandas >= 2.0; restrict the computation to numeric columns.
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(numeric_only=True))
plt.show()
# Highlight only the strong relationships: mask correlations with |r| <= 0.6.
plt.figure(figsize=(20, 10))
correlation = df.iloc[:, 1:].corr()
strong_only = correlation[correlation.abs() > 0.6]
sns.heatmap(strong_only, annot=True)
<Axes: >
1- Missing values: CREDIT_LIMIT has 1 missing value and MINIMUM_PAYMENTS has 313.
2- Highly correlated features: e.g. PURCHASES and ONEOFF_PURCHASES exceed the 0.6 threshold in the heatmap above.
3- Heavy right skew and extreme outliers in most monetary columns (see the box plots and the describe() output).
#make a copy for the original dataset
df_copy=df.copy()
df_copy.drop('CUST_ID', axis=1, inplace=True)
First issue: missing values (CREDIT_LIMIT and MINIMUM_PAYMENTS).
#solution
# Count missing values per column: CREDIT_LIMIT has 1, MINIMUM_PAYMENTS has 313.
df_copy.isnull().sum()
BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 1 PAYMENTS 0 MINIMUM_PAYMENTS 313 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
# Impute the single missing CREDIT_LIMIT with the median (robust to the skew).
df_copy['CREDIT_LIMIT'] = df_copy['CREDIT_LIMIT'].fillna(df_copy['CREDIT_LIMIT'].median())
#test
# Verify the imputation: CREDIT_LIMIT should now report 0 missing values.
df_copy.isnull().sum()
BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 0 PAYMENTS 0 MINIMUM_PAYMENTS 313 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
Second issue: redundant/problematic columns — ONEOFF_PURCHASES is highly correlated with PURCHASES, and MINIMUM_PAYMENTS still has 313 missing values.
#solution
df_copy.drop('ONEOFF_PURCHASES' , axis=1 , inplace=True)
df_copy.drop('MINIMUM_PAYMENTS' , axis=1 , inplace=True)
df_log=df_copy.copy()
df_log.head()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 0.000000 | 12 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 0.000000 | 12 |
What feature scaling technique would you use, and why?
Return to this section later, try another technique, and see how that impacts your results.
For more details on the different scaling methods, check these links.
Answer here:
from sklearn import preprocessing
df_copy1=df_copy
df_copy1.skew()
BALANCE 2.393386 BALANCE_FREQUENCY -2.023266 PURCHASES 8.144269 INSTALLMENTS_PURCHASES 7.299120 CASH_ADVANCE 5.166609 PURCHASES_FREQUENCY 0.060164 ONEOFF_PURCHASES_FREQUENCY 1.535613 PURCHASES_INSTALLMENTS_FREQUENCY 0.509201 CASH_ADVANCE_FREQUENCY 1.828686 CASH_ADVANCE_TRX 5.721298 PURCHASES_TRX 4.630655 CREDIT_LIMIT 1.522636 PAYMENTS 5.907620 PRC_FULL_PAYMENT 1.942820 TENURE -2.943017 dtype: float64
Log transformation: applied to the heavily right-skewed columns to compress their long tails.
# Columns with strong right skew that benefit from a log transform.
skewed_cols = ['BALANCE', 'PURCHASES',
               'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
               'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
               'CREDIT_LIMIT', 'PAYMENTS']
# The small offset keeps log() defined for the zero-valued entries.
df_copy1[skewed_cols] = np.log(df_copy1[skewed_cols] + 1e-4)
# Plot the distribution of every feature after the log transform.
# Fix: iterate the columns directly instead of indexing with range(len(...)).
for column in df_copy1.columns:
    plt.figure(figsize=(20, 5))
    sns.histplot(data=df_copy1, x=column)
    plt.show()
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler , RobustScaler
Helper function that draws a 2-D t-SNE projection of the data.
def draw_tsne(data, c=None):
    """Project `data` to 2-D with t-SNE and scatter-plot the embedding.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The observations to embed.
    c : array-like, optional
        Per-sample values (e.g. cluster labels) used to colour the points.
    """
    tsne = TSNE(n_components=2)
    tsne_data = tsne.fit_transform(data)
    plt.figure(figsize=(15, 5))
    plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=c, cmap='viridis')
    # Fix: the axes were labelled "Principal Component" — this is t-SNE, not PCA.
    plt.title('t-SNE Cluster Visualization')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.colorbar(label='Cluster')
    plt.show()

draw_tsne(df_copy1)
Standard scaler
# Standardize the features to zero mean and unit variance.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_copy1)
Robust scaler
# Robust scaling (median / IQR) is less sensitive to the remaining outliers.
scaler = RobustScaler()
data_scaled2 = scaler.fit_transform(df_copy1)
# Compare the t-SNE projections under the two scalings.
draw_tsne(data_scaled)
draw_tsne(data_scaled2)
def pca_without_kernel(data):
    """Plot the first two (linear) principal components of `data`.

    Fix: the projection is now returned so callers can reuse it — the
    cosine-kernel variant below already follows this convention. Callers
    that ignore the return value are unaffected.
    """
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(data)
    plt.figure(figsize=(8, 6))
    plt.scatter(pca_data[:, 0], pca_data[:, 1])
    plt.title('PCA without Kernel')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
    return pca_data
PCA on the data without a kernel
# Linear PCA projection of the log-transformed data.
pca_without_kernel(df_copy1)
Kernel PCA with 3 different kernels (RBF, polynomial, cosine)
from sklearn.decomposition import KernelPCA

def pca_with_kernel_rbf(data):
    """Plot a 2-D kernel-PCA projection of `data` using the RBF kernel.

    Fix: the projection is returned for consistency with
    pca_with_kernel_cosine; callers that ignore it are unaffected.
    """
    kpca = KernelPCA(n_components=2, kernel='rbf')
    kpca_data = kpca.fit_transform(data)
    plt.figure(figsize=(8, 6))
    plt.scatter(kpca_data[:, 0], kpca_data[:, 1])
    plt.title('PCA with Kernel (rbf)')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
    return kpca_data

pca_with_kernel_rbf(df_copy1)
from sklearn.decomposition import KernelPCA

def pca_with_kernel_poly(data):
    """Plot a 2-D kernel-PCA projection of `data` using the polynomial kernel.

    Fix: the projection is returned for consistency with
    pca_with_kernel_cosine; callers that ignore it are unaffected.
    """
    kpca = KernelPCA(n_components=2, kernel='poly')
    kpca_data = kpca.fit_transform(data)
    plt.figure(figsize=(8, 6))
    plt.scatter(kpca_data[:, 0], kpca_data[:, 1])
    plt.title('PCA with Kernel (poly)')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
    return kpca_data

pca_with_kernel_poly(df_copy1)
from sklearn.decomposition import KernelPCA

def pca_with_kernel_cosine(data):
    """Project `data` onto two kernel-PCA components (cosine kernel),
    plot the result, and return the projected coordinates."""
    projector = KernelPCA(n_components=2, kernel='cosine')
    projected = projector.fit_transform(data)
    plt.figure(figsize=(8, 6))
    plt.scatter(projected[:, 0], projected[:, 1])
    plt.title('PCA with Kernel (cosine)')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
    return projected

data_pca = pca_with_kernel_cosine(df_copy1)
The best projection was PCA with the cosine kernel.
# Two components for all 8950 customers.
data_pca.shape
(8950, 2)
1- Use the k-means class that you implemented in the previous task to cluster this data. 2- Use http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html and see if there is a difference in the result. 3- Use the elbow method to determine K (plot the result using two plots: one for distortion and another for inertia). 4- (Optionally) make a method that picks the best number of clusters for you. 5- Use different techniques for scaling and comment on the result.
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

def determine_k(data_scaled):
    """Fit KMeans for k = 1..14 and collect elbow/silhouette diagnostics.

    Parameters
    ----------
    data_scaled : array-like of shape (n_samples, n_features)

    Returns
    -------
    tuple of (inertias, distortions, silhouettes); the silhouette list starts
    at k = 2 because the score is undefined for a single cluster.
    """
    wcss = []
    distortions = []
    sil = []
    for i in range(1, 15):
        # Fix: pin n_init explicitly — newer scikit-learn changed the default
        # to 'auto' and warns when it is left unspecified.
        km = KMeans(n_clusters=i, random_state=42, n_init=10)
        km.fit(data_scaled)
        wcss.append(km.inertia_)
        # Mean squared distance of each point to its nearest centroid.
        distortions.append(sum(np.min(cdist(data_scaled, km.cluster_centers_, 'euclidean'), axis=1)**2) / data_scaled.shape[0])
        if i > 1:
            sil.append(silhouette_score(data_scaled, km.labels_, metric='euclidean'))
    return wcss, distortions, sil
wcss, distortions, sil = determine_k(df_copy1)

def _plot_k_metric(k_values, metric_values, metric_name, width):
    """Line plot of one clustering diagnostic against the number of clusters."""
    plt.figure(figsize=(width, 5))
    plt.plot(k_values, metric_values, marker='o', linestyle='--')
    # Fix: the titles previously said "vs epochs"; the x-axis is the cluster count.
    plt.title(f'{metric_name} vs number of clusters')
    plt.xlabel('number of clusters')
    plt.ylabel(metric_name)
    plt.show()

_plot_k_metric(range(1, 15), wcss, 'inertia', 10)
_plot_k_metric(range(1, 15), distortions, 'distortion', 11)
_plot_k_metric(range(2, 15), sil, 'silhouette_score', 11)
# Inspect the raw inertia values.
wcss
[2122353.025870178, 997702.9910813475, 493801.7906112459, 301795.10287595977, 180279.25477490854, 151318.14017593587, 133708.52134561515, 117374.162711114, 109376.12092573436, 102381.19408545103, 100205.00433687813, 93159.57653597393, 89347.42865137184, 83188.24049753523]
!pip install kneed
Collecting kneed Downloading kneed-0.8.5-py3-none-any.whl (10 kB) Requirement already satisfied: numpy>=1.14.2 in /usr/local/lib/python3.10/dist-packages (from kneed) (1.25.2) Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from kneed) (1.11.4) Installing collected packages: kneed Successfully installed kneed-0.8.5
from kneed import KneeLocator
# Automatically locate the elbow of the inertia curve (result below: k = 4).
kl = KneeLocator(range(1, 15), wcss, curve="convex", direction="decreasing")
kl.elbow
4
from kneed import KneeLocator
# Automatically locate the elbow of the distortion curve (result below: k = 4).
kl = KneeLocator(range(1, 15), distortions, curve="convex", direction="decreasing")
kl.elbow
4
Considering the information above, I choose 5 clusters.
from sklearn.cluster import KMeans

# Final model: 5 clusters, chosen from the elbow/silhouette diagnostics above.
# Fix: seed the run and pin n_init so the cluster labels that are interpreted
# below are reproducible.
kmeans1 = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans1.fit(df_copy1)
KMeans(n_clusters=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=5)
# Attach each customer's predicted cluster and visualize it with t-SNE.
dfnew = df_copy1.copy()
cluster_ids = kmeans1.predict(df_copy1)
dfnew['cluster1'] = cluster_ids
draw_tsne(df_copy1, cluster_ids)
# df_log still holds the un-transformed feature values, kept for interpretation.
df_log.head()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 0.000000 | 12 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 0.000000 | 12 |
After applying all the algorithms in this notebook, I found k-means to be the best one.
# Profile each cluster on the original (un-logged) feature scale.
inves_df = df_log.copy()
inves_df['Cluster'] = kmeans1.labels_
feature_columns = ['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
                   'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
                   'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
                   'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE']
clusters = inves_df['Cluster'].unique()

# One row of subplots per feature, one column per cluster.
fig, axes = plt.subplots(len(feature_columns), len(clusters),
                         figsize=(20, 7 * len(feature_columns)))
for row_idx, feature in enumerate(feature_columns):
    for col_idx, cluster_id in enumerate(clusters):
        subset = inves_df[inves_df['Cluster'] == cluster_id][feature]
        # Percentage histogram of this feature within this cluster.
        sns.histplot(subset, bins=5, ax=axes[row_idx, col_idx], stat='percent')
        axes[row_idx, col_idx].set_title(f'Cluster {cluster_id}')
        axes[row_idx, col_idx].set_xlabel(feature)
        axes[row_idx, col_idx].set_ylabel('Percent')
plt.tight_layout()
plt.show()
# Mean feature value per cluster — the basis for the business descriptions below.
inves_df.groupby('Cluster')[feature_columns].mean()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Cluster | |||||||||||||||
| 0 | 762.631611 | 0.769844 | 881.062444 | 0.000000 | 0.000000 | 0.354988 | 0.354366 | 0.000311 | 0.000000 | 0.000000 | 7.725746 | 4424.077058 | 1199.907641 | 0.135739 | 11.557836 |
| 1 | 2340.086599 | 0.925295 | 659.884782 | 0.000000 | 2031.598844 | 0.275644 | 0.275540 | 0.000000 | 0.293253 | 6.843088 | 6.278954 | 4637.888600 | 1918.729797 | 0.057814 | 11.352428 |
| 2 | 2152.064324 | 0.882442 | 0.000000 | 0.042560 | 1989.715951 | 0.000000 | 0.000000 | 0.000122 | 0.272724 | 6.299559 | 0.001958 | 4029.009626 | 1653.844906 | 0.044346 | 11.318160 |
| 3 | 2749.022276 | 0.964807 | 1496.934539 | 697.550495 | 2076.241800 | 0.739190 | 0.272882 | 0.621509 | 0.282441 | 7.257453 | 24.798780 | 5353.666064 | 2682.909852 | 0.062290 | 11.523713 |
| 4 | 801.798310 | 0.859506 | 1488.982413 | 745.046102 | 0.000000 | 0.758072 | 0.227245 | 0.659107 | 0.000000 | 0.000000 | 22.981440 | 4393.620698 | 1503.322940 | 0.281572 | 11.654106 |
Business meaning — Customers in Cluster 0: they do not use cash advances (CASH_ADVANCE_TRX = 0) or installment purchases, and they have the lowest balances.
Customers in Cluster 1: highest balance,make moderate purchases, rarely use installment plans, but frequently rely on cash advances, they have near to zero PURCHASES_INSTALLMENTS_FREQUENCY
Customers in Cluster 2: Customers with a moderate balance with minimal purchasing activity but with frequent cash advances
Customers in Cluster 3: Customers have a relatively high balance and engage in significant purchasing activities, with a portion spent on installments, along with recurring cash advances
Customers in Cluster 4: Customers have moderate credit and frequent purchasing activities, especially one-time purchases and installments, with occasional cash advances.
# Wrap the cosine-kernel PCA projection in a DataFrame for clustering.
data_pca1 = pd.DataFrame(data_pca)
data_pca1
| 0 | 1 | |
|---|---|---|
| 0 | -0.485378 | -0.165015 |
| 1 | 0.792745 | -0.101441 |
| 2 | -0.179049 | -0.353550 |
| 3 | 0.330530 | 0.025923 |
| 4 | -0.097485 | -0.474160 |
| ... | ... | ... |
| 8945 | -0.616313 | -0.125911 |
| 8946 | -0.622973 | -0.129284 |
| 8947 | -0.613403 | -0.177269 |
| 8948 | 0.857915 | -0.391939 |
| 8949 | 0.335955 | 0.153192 |
8950 rows × 2 columns
K-means on the kernel-PCA data
from sklearn.cluster import KMeans

# Cluster the 2-D kernel-PCA projection.
# Fix: seed the run and pin n_init so the assignments below are reproducible,
# matching the other KMeans fits in this notebook.
kmeans2 = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans2.fit(data_pca1)
KMeans(n_clusters=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=5)
# Fix: predict() was called twice (once for cluster_assignments, once inline in
# draw_tsne) — compute the assignments once and reuse them.
cluster_assignments = kmeans2.predict(data_pca1)
draw_tsne(data_pca1, cluster_assignments)
DBSCAN on log transform data
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

# k-distance graph: the sorted distance of every point to its k-th nearest
# neighbour, used to pick a DBSCAN eps at the curve's knee.
k = 5
neighbour_model = NearestNeighbors(n_neighbors=k).fit(df_copy1)
distances, _ = neighbour_model.kneighbors(df_copy1)

# Sort the k-th-neighbour distances ascending and plot them directly
# (the original sorted descending and re-reversed at plot time).
ascending_k_distances = np.sort(distances[:, -1])
plt.figure(figsize=(10, 10))
plt.plot(np.arange(len(ascending_k_distances)), ascending_k_distances)
plt.xlabel('Data Points (sorted by distance)')
plt.ylabel(f'{k}-distance')
plt.title(f'{k}-Distance Graph')
plt.ylim(0, 3)
plt.show()
from sklearn.cluster import DBSCAN
# Density-based clustering on the log-transformed data; eps was read off the
# k-distance graph above. Points labelled -1 are noise.
clustering = DBSCAN(eps=4, min_samples=500).fit(df_copy1)
draw_tsne(df_copy1 , c=clustering.labels_)
from sklearn.mixture import GaussianMixture

# Soft clustering with a 4-component Gaussian mixture model (seeded run).
mixture = GaussianMixture(n_components=4, random_state=42)
mixture.fit(df_copy1)
labels = mixture.predict(df_copy1)

df_gaussian = df_copy1.copy()
df_gaussian['expectation clusters'] = labels
draw_tsne(df_copy1, c=labels)
Isolation Forest
from sklearn.ensemble import IsolationForest
#Use isolation forest for anomaly detection
# Fit on the log-transformed features; predict() marks outliers with -1.
isolationforest = IsolationForest()
isolationforest.fit(df_copy1)
outlier_preds = isolationforest.predict(df_copy1)
# Rows flagged as anomalies.
# NOTE(review): `anomalies` is computed but never used afterwards — confirm intent.
anomalies = df_copy1[outlier_preds == -1]
# Re-run KMeans for a t-SNE view after the anomaly analysis.
# Fix: seed the run and pin n_init for reproducibility, matching the other
# KMeans fits in this notebook.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(df_copy1)
labels = kmeans.labels_
draw_tsne(df_copy1, labels)
Before we start the training process we need to specify 3 paramters:
1- Linkage criteria : The linkage criterion determines the distance between two clusters
- Complete-Linkage Clustering
- Single-Linkage Clustering
- Average-Linkage Clustering
- Centroid Linkage Clustering
2- Distance function:
- Euclidean Distance
- Manhattan Distance
- Mahalanobis distance
3- Number of clusters
Use Dendograms to specify the optimum number of clusters
import scipy.cluster.hierarchy as shc
# Template cell: the Ellipsis placeholders must be filled in before running
# (left as an exercise; the completed version is in the next cell).
plt.figure(figsize=(10, 7))
plt.title("Counters Dendograms")
dend = shc.dendrogram(shc.linkage(y=... , method=...,metric=...),orientation='right') #fill y with your dataframe
#and method with linkage criteria
#and metric with distance function
import scipy.cluster.hierarchy as shc
# Dendrogram with complete linkage and Euclidean distance on the log data,
# used to eyeball a sensible number of clusters.
plt.figure(figsize=(20, 20))
plt.title("Counters Dendograms")
dend = shc.dendrogram(shc.linkage(y=df_copy1 , method='complete',metric='euclidean'))
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
# Precompute one t-SNE embedding to reuse when plotting the agglomerative clusters.
tsne = TSNE(n_components=2)
tsne_data = tsne.fit_transform(df_copy1)
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering (complete linkage, Euclidean distance) for k = 3..5,
# scoring each partition with the silhouette coefficient.
n_clusters = np.arange(3, 6)
inertia = []
for i in n_clusters:
    # Fix: the `affinity` keyword was removed in scikit-learn 1.4; `metric` replaces it.
    agg = AgglomerativeClustering(n_clusters=i, linkage='complete', metric='euclidean').fit(df_copy1)
    # Fix: compute the silhouette once and reuse it — the original re-fitted the
    # model via fit_predict() just to discard the resulting score; also fixes the
    # "seliouette" typo in the printed message.
    score = silhouette_score(df_copy1, agg.labels_)
    print("silhouette score for linkage = {} and distance = {} and n_clusters = {} is {}".format('complete', 'euclidean', i, score))
    sns.scatterplot(x=tsne_data[:, :1].reshape(-1), y=tsne_data[:, 1:].reshape(-1), hue=agg.labels_, palette='viridis')
    plt.title('linkage = {} and distance = {}'.format('complete', 'euclidean'))
    plt.show()
seliouette score for linkage = complete and distance = euclidean and n_clusters = 3 is 0.613880263469066
seliouette score for linkage = complete and distance = euclidean and n_clusters = 4 is 0.6545578234374232
seliouette score for linkage = complete and distance = euclidean and n_clusters = 5 is 0.6623238130728172
# Repeat with average linkage and Manhattan distance (fresh t-SNE embedding).
tsne = TSNE(n_components=2)
tsne_data = tsne.fit_transform(df_copy1)
from sklearn.cluster import AgglomerativeClustering
n_clusters = np.arange(3, 6)
inertia = []
for i in n_clusters:
    # Fix: the `affinity` keyword was removed in scikit-learn 1.4; `metric` replaces it.
    agg = AgglomerativeClustering(n_clusters=i, linkage='average', metric='manhattan').fit(df_copy1)
    # Fix: score once and reuse (the original re-fitted via fit_predict() and
    # discarded the result); also fixes the "seliouette" typo in the message.
    score = silhouette_score(df_copy1, agg.labels_)
    print("silhouette score for linkage = {} and distance = {} and n_clusters = {} is {}".format('average', 'manhattan', i, score))
    sns.scatterplot(x=tsne_data[:, :1].reshape(-1), y=tsne_data[:, 1:].reshape(-1), hue=agg.labels_, palette='viridis')
    plt.title('linkage = {} and distance = {}'.format('average', 'manhattan'))
    plt.show()
seliouette score for linkage = average and distance = manhattan and n_clusters = 3 is 0.6137830198848035
seliouette score for linkage = average and distance = manhattan and n_clusters = 4 is 0.5753685883988677
seliouette score for linkage = average and distance = manhattan and n_clusters = 5 is 0.56803031747135
#training
from sklearn.cluster import AgglomerativeClustering